Objectives: 1) Learn the basics of dplyr -Selecting, Filtering, and Arranging -Making a series of function with pipes %>% -Mutate new and existing columns -Wide and long tables 2) Learn the basics of ggplot2 -Setting up your dataframe for easy plotting -Plot lines, scatter plots, boxplots -Compare different subpopulations
Installing and Loading packages
#Comment out if you've already installed them
#install.packages("dplyr")
#install.packages("tidyr")
#install.packages("ggplot2")
#Load libraries
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.6.2
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
## Warning: package 'tidyr' was built under R version 3.6.2
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.6.2
Loading your dataset
#setwd("~/Documents/Courses/Bootcamp/bootcamp_2020/")
PKdata <- read.csv("simtab.csv")
#Let's take a glimpse at your dataset
head(PKdata)
## ID PT_BIRTHDATE PT_USUBJID PT_ADDRESS TIME IPRED DV
## 1 1 09/20/1982 XDP5409 7385 San Francisco Drive 0 0.00000 0.00000
## 2 1 09/20/1982 XDP5409 7385 San Francisco Drive 1 0.35767 0.36910
## 3 1 09/20/1982 XDP5409 7385 San Francisco Drive 2 0.45474 0.49766
## 4 1 09/20/1982 XDP5409 7385 San Francisco Drive 4 0.43277 0.55543
## 5 1 09/20/1982 XDP5409 7385 San Francisco Drive 8 0.29634 0.28532
## 6 1 09/20/1982 XDP5409 7385 San Francisco Drive 12 0.19763 0.18520
## AMT HT BW SEX HIV IWRES CWRES PRED RES WRES DOSE
## 1 300 69 78.298 1 0 NA 0 NA 0.0000000 0 300
## 2 0 69 78.298 1 0 -10 0 0.36325 0.0058536 0 300
## 3 0 69 78.298 1 0 -10 0 0.47078 0.0268820 0 300
## 4 0 69 78.298 1 0 -10 0 0.46926 0.0861620 0 300
## 5 0 69 78.298 1 0 -10 0 0.35684 -0.0715120 0 300
## 6 0 69 78.298 1 0 -10 0 0.26497 -0.0797670 0 300
#Alternatively, click on PKdata in your Global Environment toolbar on the right
#Let's open up the data dictionary too, it's always good practice to know your dataset
Dplyr Cheatsheet https://raw.githubusercontent.com/rstudio/cheatsheets/master/data-transformation.pdf
Selecting
#You can select specific columns using select()
#Let's de-identify our dataset by selecting the columns we want
deidentified <- select(PKdata, ID, TIME, DV, DOSE, HT, BW, SEX, HIV)
head(deidentified)
## ID TIME DV DOSE HT BW SEX HIV
## 1 1 0 0.00000 300 69 78.298 1 0
## 2 1 1 0.36910 300 69 78.298 1 0
## 3 1 2 0.49766 300 69 78.298 1 0
## 4 1 4 0.55543 300 69 78.298 1 0
## 5 1 8 0.28532 300 69 78.298 1 0
## 6 1 12 0.18520 300 69 78.298 1 0
#You can also do this by "subtracting" the columns you don't want with -
deidentified <- select(PKdata, -PT_BIRTHDATE, -PT_ADDRESS, -PT_USUBJID)
head(deidentified)
## ID TIME IPRED DV AMT HT BW SEX HIV IWRES CWRES PRED
## 1 1 0 0.00000 0.00000 300 69 78.298 1 0 NA 0 NA
## 2 1 1 0.35767 0.36910 0 69 78.298 1 0 -10 0 0.36325
## 3 1 2 0.45474 0.49766 0 69 78.298 1 0 -10 0 0.47078
## 4 1 4 0.43277 0.55543 0 69 78.298 1 0 -10 0 0.46926
## 5 1 8 0.29634 0.28532 0 69 78.298 1 0 -10 0 0.35684
## 6 1 12 0.19763 0.18520 0 69 78.298 1 0 -10 0 0.26497
## RES WRES DOSE
## 1 0.0000000 0 300
## 2 0.0058536 0 300
## 3 0.0268820 0 300
## 4 0.0861620 0 300
## 5 -0.0715120 0 300
## 6 -0.0797670 0 300
#You can also search through your columns with helpers: contains(), starts_with(), ends_with()
What_did_this_do <- select(PKdata, ends_with("V"))
head(What_did_this_do)
## DV HIV
## 1 0.00000 0
## 2 0.36910 0
## 3 0.49766 0
## 4 0.55543 0
## 5 0.28532 0
## 6 0.18520 0
#Write your answer commented out below:
#removed all columns that end with V
#Can you use helpers to create the same deidentified dataset?
deidentified <- select(PKdata, -starts_with("PT"))
head(deidentified)
## ID TIME IPRED DV AMT HT BW SEX HIV IWRES CWRES PRED
## 1 1 0 0.00000 0.00000 300 69 78.298 1 0 NA 0 NA
## 2 1 1 0.35767 0.36910 0 69 78.298 1 0 -10 0 0.36325
## 3 1 2 0.45474 0.49766 0 69 78.298 1 0 -10 0 0.47078
## 4 1 4 0.43277 0.55543 0 69 78.298 1 0 -10 0 0.46926
## 5 1 8 0.29634 0.28532 0 69 78.298 1 0 -10 0 0.35684
## 6 1 12 0.19763 0.18520 0 69 78.298 1 0 -10 0 0.26497
## RES WRES DOSE
## 1 0.0000000 0 300
## 2 0.0058536 0 300
## 3 0.0268820 0 300
## 4 0.0861620 0 300
## 5 -0.0715120 0 300
## 6 -0.0797670 0 300
Filtering
#You can filter out specific rows using filter()
#For example, selecting all patients who received a specific dose
Dosed_600 <- filter(PKdata, DOSE == 600)
head(Dosed_600)
## ID PT_BIRTHDATE PT_USUBJID PT_ADDRESS TIME IPRED
## 1 301 09/20/1982 XDP5607 1296 San Francisco Drive 0 0.00000
## 2 301 09/20/1982 XDP5607 1296 San Francisco Drive 1 0.73153
## 3 301 09/20/1982 XDP5607 1296 San Francisco Drive 2 0.95626
## 4 301 09/20/1982 XDP5607 1296 San Francisco Drive 4 0.97317
## 5 301 09/20/1982 XDP5607 1296 San Francisco Drive 8 0.77547
## 6 301 09/20/1982 XDP5607 1296 San Francisco Drive 12 0.60406
## DV AMT HT BW SEX HIV IWRES CWRES PRED RES WRES DOSE
## 1 0.00000 600 70 64.202 1 0 NA 0 NA 0.000000 0 600
## 2 0.75396 0 70 64.202 1 0 -10 0 0.72684 0.027128 0 600
## 3 1.08000 0 70 64.202 1 0 -10 0 0.94255 0.137470 0 600
## 4 1.00730 0 70 64.202 1 0 -10 0 0.94086 0.066440 0 600
## 5 0.73731 0 70 64.202 1 0 -10 0 0.71775 0.019561 0 600
## 6 0.60851 0 70 64.202 1 0 -10 0 0.53471 0.073796 0 600
#You can use any logical function in R to designate your filter criteria
# > greater than, <= greater equal than
# < less than, <= lesser equal than
# ! not
# & and
# | or
#Can you filter for patients with body weight greater or equal than 45kg?
WT.GT.45 <- filter(PKdata, BW >= 45)
head(WT.GT.45)
## ID PT_BIRTHDATE PT_USUBJID PT_ADDRESS TIME IPRED DV
## 1 1 09/20/1982 XDP5409 7385 San Francisco Drive 0 0.00000 0.00000
## 2 1 09/20/1982 XDP5409 7385 San Francisco Drive 1 0.35767 0.36910
## 3 1 09/20/1982 XDP5409 7385 San Francisco Drive 2 0.45474 0.49766
## 4 1 09/20/1982 XDP5409 7385 San Francisco Drive 4 0.43277 0.55543
## 5 1 09/20/1982 XDP5409 7385 San Francisco Drive 8 0.29634 0.28532
## 6 1 09/20/1982 XDP5409 7385 San Francisco Drive 12 0.19763 0.18520
## AMT HT BW SEX HIV IWRES CWRES PRED RES WRES DOSE
## 1 300 69 78.298 1 0 NA 0 NA 0.0000000 0 300
## 2 0 69 78.298 1 0 -10 0 0.36325 0.0058536 0 300
## 3 0 69 78.298 1 0 -10 0 0.47078 0.0268820 0 300
## 4 0 69 78.298 1 0 -10 0 0.46926 0.0861620 0 300
## 5 0 69 78.298 1 0 -10 0 0.35684 -0.0715120 0 300
## 6 0 69 78.298 1 0 -10 0 0.26497 -0.0797670 0 300
#You can also use & or | to create more complex logical functions
#Let's filter for patients that received a dose greater than or equal to 600 and HIV+
Dosed600up_HIV <- filter(PKdata, DOSE >= 600 & HIV == 1)
head(Dosed600up_HIV)
## ID PT_BIRTHDATE PT_USUBJID PT_ADDRESS TIME IPRED
## 1 315 09/20/1982 XDP4217 7593 San Francisco Drive 0 0.00000
## 2 315 09/20/1982 XDP4217 7593 San Francisco Drive 1 0.73035
## 3 315 09/20/1982 XDP4217 7593 San Francisco Drive 2 0.95281
## 4 315 09/20/1982 XDP4217 7593 San Francisco Drive 4 0.96497
## 5 315 09/20/1982 XDP4217 7593 San Francisco Drive 8 0.76058
## 6 315 09/20/1982 XDP4217 7593 San Francisco Drive 12 0.58588
## DV AMT HT BW SEX HIV IWRES CWRES PRED RES WRES DOSE
## 1 0.00000 600 64 65.279 1 1 NA 0 NA 0.00000000 0 600
## 2 0.73102 0 64 65.279 1 1 -10 0 0.71913 0.01188800 0 600
## 3 0.91986 0 64 65.279 1 1 -10 0 0.92031 -0.00044695 0 600
## 4 1.05310 0 64 65.279 1 1 -10 0 0.88979 0.16336000 0 600
## 5 0.84030 0 64 65.279 1 1 -10 0 0.63150 0.20880000 0 600
## 6 0.54619 0 64 65.279 1 1 -10 0 0.43691 0.10928000 0 600
#Challenge: Filter for patients who received the lowest or highest dose, and is female with HIV or male
#Remember there are always many ways to code something :)
Challenge <- filter(PKdata, DOSE != 600 & (HIV == 1 | SEX == 1))
head(Challenge)
## ID PT_BIRTHDATE PT_USUBJID PT_ADDRESS TIME IPRED DV
## 1 1 09/20/1982 XDP5409 7385 San Francisco Drive 0 0.00000 0.00000
## 2 1 09/20/1982 XDP5409 7385 San Francisco Drive 1 0.35767 0.36910
## 3 1 09/20/1982 XDP5409 7385 San Francisco Drive 2 0.45474 0.49766
## 4 1 09/20/1982 XDP5409 7385 San Francisco Drive 4 0.43277 0.55543
## 5 1 09/20/1982 XDP5409 7385 San Francisco Drive 8 0.29634 0.28532
## 6 1 09/20/1982 XDP5409 7385 San Francisco Drive 12 0.19763 0.18520
## AMT HT BW SEX HIV IWRES CWRES PRED RES WRES DOSE
## 1 300 69 78.298 1 0 NA 0 NA 0.0000000 0 300
## 2 0 69 78.298 1 0 -10 0 0.36325 0.0058536 0 300
## 3 0 69 78.298 1 0 -10 0 0.47078 0.0268820 0 300
## 4 0 69 78.298 1 0 -10 0 0.46926 0.0861620 0 300
## 5 0 69 78.298 1 0 -10 0 0.35684 -0.0715120 0 300
## 6 0 69 78.298 1 0 -10 0 0.26497 -0.0797670 0 300
Arranging
#arrange() will reorder rows or columns according to your specifications (default from low to high)
#For example in PK data we will often organize our rows by patient ID, then by time.
PKdata <- arrange(PKdata, ID, TIME)
Putting it all together with Pipes %>%
#Pipes are used to connect the output of one function to the input of another
#Let's deidentify our dataset, filter for 600mg dose, and arrange by id and time all together
Dose_600 <- PKdata %>%
select( -starts_with("PT")) %>%
filter(DOSE == 600) %>%
arrange(ID, TIME)
#Try this one for yourself!
#Let's try filtering for HIV+ or body weight less than 45kg, then...
#we want to see the trough levels of each patient (defined as the 24hr timepoint)
Trough <- PKdata %>%
select( -starts_with("PT")) %>%
filter(HIV == 1 | BW < 45) %>%
filter(TIME == 24)
View(Trough)
Mutate
#mutate() can create new columns or alter existing ones
#For example, BMI is often calculated from HT and WT
#BMI = weight(kg)/height(m)^2
PKdata <- PKdata %>%
mutate(BMI = BW/(HT/39.37)^2) #are the units correct? check the data dictionary!
#Now you try making a new column and converting DV from mg/L into micromoles!
#The drug used here is remdesivir
PKdata<- PKdata %>%
mutate(Conc_uM = DV/1000/602.6*1000000) #mg/L *1g/1000mg * 1mol/602.6g * 1000000uM/1M
Our first Plot!! ggplot2 cheatsheet: https://raw.githubusercontent.com/rstudio/cheatsheets/master/data-visualization-2.1.pdf
ggplot(PKdata) +
geom_point(aes(x=TIME, y=DV)) ##What are we trying to plot with a PK profile?
#Do you like lines better?
ggplot(PKdata) +
geom_line(aes(x=TIME, y=DV,group=factor(ID)),alpha = 0.1, linetype=2,size=0.1)
#Let's put them on the same graph with a +
ggplot(PKdata) +
geom_line(aes(x=TIME, y=DV,group=factor(ID)),alpha = 0.1, linetype=2,size=0.1) +
geom_point(aes(x=TIME, y=DV))
Graphing Subpopulations
ggplot(PKdata) +
geom_point(aes(x=TIME, y=DV, color = as.factor(DOSE))) +
geom_line(aes(x=TIME, y=DV, group=factor(ID), color = as.factor(DOSE)),alpha = 0.1, linetype=2,size=0.1)
Another way to stratify
#It's still a little too cluttered
ggplot(PKdata) +
geom_point(aes(x=TIME, y=DV, color = as.factor(DOSE))) +
geom_line(aes(x=TIME, y=DV, group=factor(ID), color = as.factor(DOSE)),alpha = 0.1, linetype=2,size=0.1) +
facet_wrap(.~DOSE)
Let’s try some on your own
#Copy and paste from above, and let's plot stratified by HIV status
ggplot(PKdata) +
geom_point(aes(x=TIME, y=DV, color = as.factor(HIV))) +
geom_line(aes(x=TIME, y=DV, group=factor(ID), color = as.factor(HIV)),alpha = 0.1, linetype=2,size=0.1) +
facet_wrap(.~DOSE)
#Now let's try adding 2 stratifications: color by HIV and facet_wrap by Dose~SEX. Then try the other way around!
ggplot(PKdata) +
geom_point(aes(x=TIME, y=DV, color = as.factor(HIV))) +
geom_line(aes(x=TIME, y=DV, group=factor(ID), color = as.factor(HIV)),alpha = 0.1, linetype=2,size=0.1) +
facet_wrap(DOSE~SEX)
ggplot(PKdata) +
geom_point(aes(x=TIME, y=DV, color = as.factor(SEX))) +
geom_line(aes(x=TIME, y=DV, group=factor(ID), color = as.factor(SEX)),alpha = 0.1, linetype=2,size=0.1) +
facet_wrap(DOSE~~HIV)
#Challenge: Plot stratified by body weight, one color for above 45kg and another color for below. Make sure to have a legend :)
#Hint: you will need to manipulate your dataframe
PKdata <- PKdata %>%
mutate(WT_FLAG = ifelse(BW > 45, 1, 0))
ggplot(PKdata) +
geom_point(aes(x=TIME, y=DV, color = as.factor(WT_FLAG))) +
geom_line(aes(x=TIME, y=DV, group=factor(ID), color = as.factor(WT_FLAG)),alpha = 0.1, linetype=2,size=0.1) +
facet_wrap(.~DOSE)
Summarise Dplyr can be used for simple summary statistics
#You can use summarise to calculate means, ranges, standard deviations, quartiles, etc.
Summary_stats <- PKdata %>%
summarise(c_mean = mean(DV),
c_stdev = sd(DV),
BW_mean = mean(BW),
BW_upper = quantile(BW, probs = 0.975),
BW_lower = quantile(BW, probs = 0.225))
View(Summary_stats)
Combined with group_by(), summarise is a force to be reckoned with..
#Let's calculate subpopulation summary statistics
#For example, let's calculate body weight summary statistics by SEX
BW_by_SEX <- PKdata %>%
group_by(SEX) %>%
summarise(BW_mean = mean(BW),
BW_upper = quantile(BW, probs = 0.975),
BW_lower = quantile(BW, probs = 0.225))
## `summarise()` ungrouping output (override with `.groups` argument)
#Let's plot it!
ggplot(BW_by_SEX) +
geom_point(aes(x=SEX, y=BW_mean)) +
geom_errorbar((aes(x=SEX, ymin=BW_lower, ymax=BW_upper)))
#An easier way to do it in just ggplot, there are always multiple ways to do the same thing
ggplot(PKdata) +
geom_boxplot(aes(x=as.factor(SEX), y=BW))
Let’s calculate the median PK profile
#Let's start with calculating median and percentile profiles
Med <- PKdata %>%
group_by(TIME) %>%
summarise(median = median(DV),
upper = quantile(DV, probs=0.975), #Upper 95th percentile
lower = quantile(DV, probs=0.225)) #Lower 95th percentile
## `summarise()` ungrouping output (override with `.groups` argument)
#Let's plot it
ggplot(Med) +
geom_line(aes(x=TIME, y=median)) +
geom_ribbon(aes(x=TIME, ymin=lower, ymax=upper), alpha=0.3)
Let’s compare median PK profiles of subpopulations
#Let's do the same thing but subset by DOSE
DOSE_Med <- PKdata %>%
group_by(DOSE,TIME) %>%
summarise(median = median(DV),
upper = quantile(DV, probs=0.975), #Upper 95th percentile
lower = quantile(DV, probs=0.225)) #Lower 95th percentile
## `summarise()` regrouping output by 'DOSE' (override with `.groups` argument)
#Let's plot it
ggplot(DOSE_Med) +
geom_line(aes(x=TIME, y=median, color = as.factor(DOSE))) +
geom_ribbon(aes(x=TIME, ymin=lower, ymax=upper, fill = as.factor(DOSE)), alpha=0.3)
Let’s try another one…
#Median PK profiles, subset by HIV
HIV_Med <- PKdata %>%
group_by(HIV,TIME) %>%
summarise(median = median(DV),
upper = quantile(DV, probs=0.975), #Upper 95th percentile
lower = quantile(DV, probs=0.225)) #Lower 95th percentile
## `summarise()` regrouping output by 'HIV' (override with `.groups` argument)
#Let's plot it
ggplot(HIV_Med) +
geom_line(aes(x=TIME, y=median, color = as.factor(HIV))) +
geom_ribbon(aes(x=TIME, ymin=lower, ymax=upper, fill = as.factor(HIV)), alpha=0.3)
This doesn’t really work…the different dosing levels make the stratification cluttered
#We need to subset by DOSE and HIV
#Prepare the dataset yourself
HIV_Med <- PKdata %>%
group_by(DOSE, HIV,TIME) %>%
summarise(median = median(DV),
upper = quantile(DV, probs=0.975), #Upper 95th percentile
lower = quantile(DV, probs=0.225)) #Lower 95th percentile
## `summarise()` regrouping output by 'DOSE', 'HIV' (override with `.groups` argument)
#Let's plot it
ggplot(HIV_Med) +
geom_line(aes(x=TIME, y=median, color = as.factor(HIV))) +
geom_ribbon(aes(x=TIME, ymin=lower, ymax=upper, fill = as.factor(HIV)), alpha=0.3) +
facet_wrap(.~DOSE)